# Akram Yazdani et al. Novel treatment-specific causal biomarkers for 
# colorectal cancer by omics integration, NAR Genomics and Bioinformatics, 2025. 
# Immune cell type aboundance was estimated using CIBERSORTx. 
# eQTL analysis was performed using FastQT. 
# TCGA data were downloaded from TCGAbiolinks package in R.
# All the analysis were performed using Standard R packages such as survminer, survival, lm.
# We have provided the script for generating instrumental variables (IVs).

library("survival")
library("data.table")
library("amap")
library("cluster")

###########################
mean_imp <- function( x)
{
  if( any(is.na(x))) x[which(is.na(x))] <- mean(x, na.rm = TRUE)
  return(x)
}
###########################
maf = function(x){
  a = table(x == 2)/length(x)
  return(a[2])
}
###########################
fit_snp_os <- function(snp, Y)
{
  snp <-  ifelse( snp == 0, 0, 1 )
  
  snp <- mean_imp(snp)
  data <- data.frame("snp" = snp, Y)
  fit <- coxph( Surv(fu_time, dead) ~ snp + age + SEX_ID + braf_v600e_mut + expanded_RAS + PC1 , data = data)
  return(c(summary(fit)[[7]][1, ], summary(fit)[[8]][1, 3:4]))
  
}

######################################################
##############      Kmean
######################################################

data <- read.table("RNA-seq", sep = "\t", header = TRUE)
sel_gene <- read.table("Genes_eQTL", sep = "\t", header = TRUE)
covar <- read.table("Covar", sep = "\t", header = TRUE)
Gene  <- data[ , c(1, which(colnames(data) %in% colnames(sel_gene)))]

######################################################
adj_fun <- function(gene, covar)
{
  fit <- lm( gene ~ age + Gender + BRAFv600E + expanded_RAS + B.cells.naive + B.cells.memory + Plasma.cells + T.cells.CD8 + T.cells.CD4.memory.resting + 	        		T.cells.CD4.memory.activated + Macrophages.M0 + Macrophages.M2 + Mast.cells.activated, data = covar)
  return(resid(fit))
  
}

adjusted_exp <- cbind.data.frame("IID" = Gene[,1], apply(Gene[ ,-1], 2, adj_fun, covar))
######################################################

clusters = Kmeans(x = t(adjusted_exp[ , -1]) , centers = 4, method = "pearson", iter.max = 100, nstart = 25)


gene_class <- list()
for( i in 1:4)
{
  gene_class[[i]]  <- adjusted_exp[ , c(1, which(colnames(adjusted_exp) %in% names(clusters$cluster[clusters$cluster == i]) ) ) ]
}
######################################################
##########        gene-OS analysis
######################################################

eqtl<- read.table("eQTL", sep="\t", header = TRUE)

y <- covar[ , colnames(covar) %in% c("fu_time", "dead")]

y_1 <- y[covar$arm == 1, ]
y_2 <- y[covar$arm == 2, ]

covar_1 <- covar[covar$arm == 1, ]
covar_2 <- covar[covar$arm == 2, ]


Res<- list()

for( i in 1:4) 
{ 
  print(i)        
  gene_class[[i]] 
  
  x_1 <- Gene[ Gene$ID %in% covar$ID[covar$arm == 1],  ]
  x_2 <- Gene[ Gene$ID %in% covar$ID[covar$arm == 2],  ]
  
  data_1 <- list(y_1, covar_1, x_1)
  data_2 <- list(y_2, covar_2, x_2)
  
  Data_1 <- Reduce(function(x, y) merge(x, y, by = "ID"), data_1)	
  Data_2 <- Reduce(function(x, y) merge(x, y, by = "ID"), data_2)	
  
  formulla_1 <- as.formula ( paste("Surv(fu_time, dead)", paste( paste0(colnames(covar), collapse = "+"), paste0(colnames(x_1), collapse = "+"), sep="+"), sep="~"))
  formulla_2 <- as.formula ( paste("Surv(fu_time, dead)", paste( paste0(colnames(covar), collapse = "+"), paste0(colnames(x_2), collapse = "+"), sep="+"), sep="~"))
  
  fitAalen_1 <- summary(aareg(formulla, data =Data_1, nmin = 2) )$table
  fitAalen_2 <- summary(aareg(formulla, data =Data_2, nmin = 2) )$table
  
  res_1 <- data.frame("class" = rep(i, nrow(fitAalen_1)), "phe_id" = rownames(fitAalen_1), "chr" = rep(i, nrow(fitAalen_1)), fitAalen_1)
  res_2 <- data.frame("class" = rep(i, nrow(fitAalen_2)), "phe_id" = rownames(fitAalen_2), "chr" = rep(i, nrow(fitAalen_2)), fitAalen_2)
  
  rownames(res_1) <- NULL
  rownames(res_2) <- NULL
  
  Res_1[[i]] <- res_1[-c(1:18), ]
  Res_2[[i]] <- res_2[-c(1:18), ]
  
}

Result_1 <- do.call(rbind.data.frame, Res_1)
Result_2 <- do.call(rbind.data.frame, Res_2)

##############################################################################
##########        Identifying  IVs
##############################################################################

# SNP:  genotype data 
# covar_outcomes: includes covariates and outcomes for survival analysis

res <- apply(SNP, 2, fit_snp_os, covar_outcomes)
Res <- t(res)
colnames(Res) <- c( "coef", "exp_coef", "se", "z", "p_val", "lower_CI_95", "upper_CI_95")

Res_selected <- Res[!Res$p_val < 0.0001,  ]


################ 
# no_genes: genes  with eQTL and potential IVs
# Q[[i]] is an n×p_i matrix of IVs in the ith gene    
# W[[i]] is a vector of the estimated coefficients from the eQTL analysis. 

for ( in in 1: no_genes)
{
  g[[i]] <- Q[[i]] %in% W[[i]]
  
}
g[[(i+1)]] <- covar_outcomes[ , colnames("fu_time", "dead")]
G <- do.call(cbind, g)


formulla <- as.formula ( paste("Surv(fu_time, dead)", paste0(colnames(G), collapse = "+"), sep = "~"))

fitAalen <- summary(aareg(formulla, data = G) )$table